#PCA - determine the most useful variables
# Load necessary libraries
library(ggbiplot) # Install with devtools::install_github("vqv/ggbiplot")
library(dplyr)
library(caret)
library(ggplot2)
library(plotly)
# Load the dataset
data(iris)
iris_df <- as.data.frame(iris)
# Remove duplicate rows
iris_df <- distinct(iris_df)
# Extract only numerical variables for PCA
sampled_var <- iris_df %>% select(-Species)
# Perform PCA (center and scale the data)
pca_results <- prcomp(sampled_var, center = TRUE, scale. = TRUE)
# Summary of PCA (explains variance captured by each principal component)
summary(pca_results)
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 1.7087 0.9562 0.38111 0.1442
Proportion of Variance 0.7299 0.2286 0.03631 0.0052
Cumulative Proportion 0.7299 0.9585 0.99480 1.0000
# Scree plot - Proportion of Variance Explained (PVE)
pve <- (pca_results$sdev^2) / sum(pca_results$sdev^2)
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained (PVE)",
main = "Scree Plot",
pch = 20, cex = 2, col = "blue", type = "b")
# Cumulative Variance Explained
cpv <- cumsum(pve)
plot(cpv, xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
main = "Cumulative Variance Explained",
pch = 20, cex = 2, col = "blue", type = "b")
# Extract loading matrix (contribution of original variables to PCs)
pca_loadings <- pca_results$rotation
print(pca_loadings) # Check which variables contribute most to each PC
PC1 PC2 PC3 PC4
Sepal.Length 0.5220620 -0.37334080 0.7203991 0.2628555
Sepal.Width -0.2676921 -0.92485914 -0.2402154 -0.1235842
Petal.Length 0.5803119 -0.02431115 -0.1405854 -0.8017998
Petal.Width 0.5648277 -0.06827292 -0.6352617 0.5222557
The first PC captures the most variance in the dataset.
Interpretation: PC1 is mainly influenced by petal size (both length & width), meaning it likely captures the overall size of the flower.
# Extract PCA scores (newly projected data in PC space)
pca_scores <- pca_results$x
# Visualizing PCA - Biplot
ggbiplot(pca_results, choices = c(1,2), obs.scale = 1, var.scale = 1,
groups = iris_df$Species, ellipse = TRUE, var.axes = TRUE) +
theme_minimal() +
ggtitle("PCA Biplot of the Iris Dataset") +
scale_color_manual(values = c("setosa" = "red", "versicolor" = "blue", "virginica" = "green"))
#3D PCA
# Extract PCA scores (newly projected data in PC space)
pca_scores <- as.data.frame(pca_results$x)
# Add species labels for coloring
pca_scores$Species <- iris_df$Species
# Define colors for species
my_colors <- c("setosa" = "red", "versicolor" = "blue", "virginica" = "green")
# Create an interactive 3D PCA plot
plot_ly(pca_scores,
x = ~PC1,
y = ~PC2,
z = ~PC3,
color = ~Species,
colors = my_colors,
type = "scatter3d",
mode = "markers") %>%
layout(title = "3D PCA Visualization of Iris Dataset",
scene = list(xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")))